This module extends code contained in Coronavirus_Statistics_v005.Rmd to include sourcing of updated functions and parameters. This file includes the latest code for analyzing all-cause death data from CDC Weekly Deaths by Jurisdiction. CDC maintains data on deaths by week, age cohort, and state in the US. Downloaded data are unique by state, epidemiological week, year, age, and type (actual vs. predicted/projected).
These data are known to have a lag between death and reporting, and the CDC back-correct to report deaths at the time the death occurred even if the death is reported in following weeks. This means totals for recent weeks tend to run low (lag), and the CDC run a projection of the expected total number of deaths given the historical lag times. Per other analysts on the internet, there is currently significant supra-lag, with lag times much longer than historical averages causing CDC projected deaths for recent weeks to be low.
The code leverages tidyverse and sourced functions throughout:
# All functions assume that tidyverse and its components are loaded and available
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.1.1 v dplyr 1.0.6
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# If the same function is in both files, use the version from the more specific source
source("./Generic_Added_Utility_Functions_202105_v001.R")
source("./Coronavirus_CDC_Excess_Functions_v001.R")
The basic process includes three data update steps:
# STEP 0: Appropriate parameters for 2022 data
cdcExcessParams <- list(remapVars=c('Jurisdiction'='fullState',
'Week Ending Date'='weekEnding',
'State Abbreviation'='state',
'Age Group'='age',
'Number of Deaths'='deaths',
'Time Period'='period',
'Year'='year',
'Week'='week'
),
colTypes="ccciicdcccc",
ageLevels=c("Under 25 years",
"25-44 years",
"45-64 years",
"65-74 years",
"75-84 years",
"85 years and older"
),
periodLevels=c("2015-2019", "2020", "2021", "2022"),
periodKeep=c("2015-2019", "2020", "2021"),
yearLevels=2015:2022
)
# STEP 1: Latest CDC all-cause deaths data
cdcLoc <- "Weekly_counts_of_deaths_by_jurisdiction_and_age_group_downloaded_20220623.csv"
cdcList_20220623 <- readRunCDCAllCause(loc=cdcLoc,
weekThru=21,
lst=readFromRDS("cdc_daily_220602"),
stateNoCheck=c(),
pdfCluster=TRUE,
pdfAge=TRUE
)
##
## Parameter cvDeathThru has been set as: 2022-05-28
##
##
## *** Data suppression checks ***
##
## Rows in states to be checked that have NA deaths or a note for suppression:
## state weekEnding year week age
## 1 SD 2022-04-30 2022 17 65-74 years
## 2 SD 2022-04-30 2022 17 75-84 years
## Suppress deaths
## 1 Suppressed (counts highly incomplete, <50% of expected) NA
## 2 Suppressed (counts highly incomplete, <50% of expected) NA
##
##
## Problems by state:
## # A tibble: 1 x 5
## noCheck state problem n deaths
## <lgl> <chr> <lgl> <int> <dbl>
## 1 FALSE SD TRUE 2 NA
##
##
## There are 2 rows with errors; maximum for any given state is 2 errors
##
##
## Data suppression checks passed
##
##
## *** File has been checked for uniqueness by: state year week age
##
## Rows: 106,840
## Columns: 12
## $ fullState <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama", "Ala~
## $ weekEnding <date> 2015-01-10, 2015-01-10, 2015-01-10, 2015-01-10, 2015-01-10~
## $ state <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL",~
## $ year <fct> 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015,~
## $ week <int> 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4,~
## $ age <fct> Under 25 years, 25-44 years, 45-64 years, 65-74 years, 75-8~
## $ period <fct> 2015-2019, 2015-2019, 2015-2019, 2015-2019, 2015-2019, 2015~
## $ Type <chr> "Predicted (weighted)", "Predicted (weighted)", "Predicted ~
## $ Suppress <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,~
## $ n <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,~
## $ deaths <dbl> 25, 67, 253, 202, 272, 320, 28, 49, 256, 222, 253, 332, 26,~
## $ Note <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,~
##
## Check Control Levels and Record Counts for Processed Data:
##
##
## Checking variable combination: age
## # A tibble: 6 x 4
## age n n_deaths_na deaths
## <fct> <dbl> <dbl> <dbl>
## 1 Under 25 years 12528 0 434501
## 2 25-44 years 16114 0 1115606
## 3 45-64 years 19554 0 4261157
## 4 65-74 years 19547 0 4306424
## 5 75-84 years 19554 0 5271898
## 6 85 years and older 19543 0 6662410
##
##
## Checking variable combination: period year Type
## # A tibble: 8 x 6
## period year Type n n_deaths_na deaths
## <fct> <fct> <chr> <dbl> <dbl> <dbl>
## 1 2015-2019 2015 Predicted (weighted) 14367 0 2698242
## 2 2015-2019 2016 Predicted (weighted) 14445 0 2725557
## 3 2015-2019 2017 Predicted (weighted) 14408 0 2802070
## 4 2015-2019 2018 Predicted (weighted) 14400 0 2830373
## 5 2015-2019 2019 Predicted (weighted) 14413 0 2843917
## 6 2020 2020 Predicted (weighted) 14834 0 3432792
## 7 2021 2021 Predicted (weighted) 14698 0 3451431
## 8 2022 2022 Predicted (weighted) 5275 0 1267614
##
##
## Checking variable combination: period Suppress
## # A tibble: 4 x 5
## period Suppress n n_deaths_na deaths
## <fct> <chr> <dbl> <dbl> <dbl>
## 1 2015-2019 <NA> 72033 0 13900159
## 2 2020 <NA> 14834 0 3432792
## 3 2021 <NA> 14698 0 3451431
## 4 2022 <NA> 5275 0 1267614
##
##
## Checking variable combination: period Note
## # A tibble: 9 x 5
## period Note n n_deaths_na deaths
## <fct> <chr> <dbl> <dbl> <dbl>
## 1 2015-20~ <NA> 72033 0 1.39e7
## 2 2020 Data in recent weeks are incomplete. Only ~ 279 0 8.68e4
## 3 2020 <NA> 14555 0 3.35e6
## 4 2021 Data in recent weeks are incomplete. Only ~ 12116 0 2.42e6
## 5 2021 Data in recent weeks are incomplete. Only ~ 10 0 2.58e2
## 6 2021 Data in recent weeks are incomplete. Only ~ 2572 0 1.04e6
## 7 2022 Data in recent weeks are incomplete. Only ~ 4347 0 1.06e6
## 8 2022 Data in recent weeks are incomplete. Only ~ 76 0 1.80e4
## 9 2022 Data in recent weeks are incomplete. Only ~ 852 0 1.90e5
##
## *** File has been checked for uniqueness by: cluster year week
##
## Plots will be run after excluding stateNoCheck states
##
## Detailed cluster summary PDF file is available at: ./RInputFiles/Coronavirus/Plots/CDC_cluster_2022w21.pdf
##
## Returning plot outputs to the main log file
## Joining, by = "state"
##
## Detailed age summary PDF file is available at: ./RInputFiles/Coronavirus/Plots/CDC_age_2022w21.pdf
##
## Returning plot outputs to the main log file
saveToRDS(cdcList_20220623, ovrWriteError=FALSE)
# STEP 2: Latest death bu location-cause data
allCause_220623 <- analyzeAllCause(loc="COvID_deaths_age_place_20220623.csv",
cdcDailyList=readFromRDS("cdc_daily_220602"),
compareThruDate="2022-05-31"
)
## `summarise()` has grouped output by 'State'. You can override using the `.groups` argument.
##
## States without abbreviations
## # A tibble: 2 x 10
## # Groups: State [2]
## State abb Year Month covidDeaths totalDeaths pneumoDeaths pneumoCovidDeat~
## <chr> <chr> <int> <int> <dbl> <dbl> <dbl> <dbl>
## 1 New Y~ <NA> 0 0 35136 170882 22567 13036
## 2 Puert~ <NA> 0 0 4311 78570 11023 3082
## # ... with 2 more variables: fluDeaths <dbl>, pnemoFluCovidDeaths <dbl>
##
## Sub-lists are identical by: asofDate, startDate, endDate, Group, State, deathPlace, Age
## # A tibble: 1,748 x 12
## asofDate startDate endDate Group State deathPlace Age name dfSub
## <date> <date> <date> <chr> <chr> <chr> <chr> <chr> <dbl>
## 1 2022-06-02 2020-10-01 2020-10-31 By Mo~ Unite~ Total - All~ 30-3~ pnem~ 205
## 2 2022-06-02 2021-08-01 2021-08-31 By Mo~ Unite~ Other All ~ pneu~ 671
## 3 2022-06-02 2021-10-01 2021-10-31 By Mo~ Unite~ Decedent's ~ 40-4~ pnem~ 149
## 4 2022-06-02 2020-02-01 2020-02-29 By Mo~ Unite~ Total - All~ 30-3~ pnem~ 71
## 5 2022-06-02 2021-11-01 2021-11-30 By Mo~ Unite~ Healthcare ~ 75-8~ pnem~ 139
## 6 2022-06-02 2020-11-01 2020-11-30 By Mo~ Unite~ Total - All~ 30-3~ pneu~ 227
## 7 2022-06-02 2022-04-01 2022-04-30 By Mo~ Unite~ Total - All~ All ~ fluD~ 168
## 8 2022-06-02 2020-08-01 2020-08-31 By Mo~ Unite~ Other 0-17~ tota~ 116
## 9 2022-06-02 2020-09-01 2020-09-30 By Mo~ Unite~ Decedent's ~ 50-6~ pnem~ 190
## 10 2022-06-02 2021-10-01 2021-10-31 By Mo~ Unite~ Decedent's ~ 65-7~ pneu~ 86
## # ... with 1,738 more rows, and 3 more variables: dfTot <dbl>, delta <dbl>,
## # pct <dbl>
##
## Sub-lists are identical by: asofDate, startDate, endDate, Group, State, deathPlace, Age
## # A tibble: 0 x 12
## # ... with 12 variables: asofDate <date>, startDate <date>, endDate <date>,
## # Group <chr>, State <chr>, deathPlace <chr>, Age <chr>, name <chr>,
## # dfSub <dbl>, dfTot <dbl>, delta <dbl>, pct <dbl>
##
## Sub-lists are identical by: asofDate, startDate, endDate, Group, State, deathPlace, Age
## # A tibble: 0 x 12
## # ... with 12 variables: asofDate <date>, startDate <date>, endDate <date>,
## # Group <chr>, State <chr>, deathPlace <chr>, Age <chr>, name <chr>,
## # dfSub <dbl>, dfTot <dbl>, delta <dbl>, pct <dbl>
## # A tibble: 51 x 4
## abb cumValue tot_deaths pctdiff
## <chr> <dbl> <dbl> <dbl>
## 1 NY 36518 68346 0.304
## 2 DC 2010 1343 0.199
## 3 ND 2777 2283 0.0976
## 4 NC 28931 24660 0.0797
## 5 GA 32614 38198 0.0789
## 6 WY 1577 1820 0.0715
## 7 NE 4947 4290 0.0711
## 8 OH 43659 38628 0.0611
## 9 MI 32215 36357 0.0604
## 10 OK 16139 14420 0.0563
## # ... with 41 more rows
## # A tibble: 1 x 3
## cumValue tot_deaths pctdiff
## <dbl> <dbl> <dbl>
## 1 969868 997512 1.82
saveToRDS(allCause_220623, ovrWriteError=FALSE)
# STEP 3: Facets for excess all-cause deaths
excessDeathFacets(lstCDC=cdcList_20220623, lstAll=allCause_220623, dateThru="2022-04-30", plotYLim=c(-200, 1200))
Updated with the latest data:
# STEP 1: Latest CDC all-cause deaths data
cdcLoc <- "Weekly_counts_of_deaths_by_jurisdiction_and_age_group_downloaded_20220713.csv"
cdcList_20220713 <- readRunCDCAllCause(loc=cdcLoc,
weekThru=24,
lst=readFromRDS("cdc_daily_220704"),
stateNoCheck=c(),
pdfCluster=TRUE,
pdfAge=TRUE
)
##
## Parameter cvDeathThru has been set as: 2022-06-18
##
##
## *** Data suppression checks ***
##
## Rows in states to be checked that have NA deaths or a note for suppression:
## [1] state weekEnding year week age Suppress deaths
## <0 rows> (or 0-length row.names)
##
##
## Problems by state:
## # A tibble: 0 x 5
## # ... with 5 variables: noCheck <lgl>, state <chr>, problem <lgl>, n <int>,
## # deaths <dbl>
## Warning in max(.): no non-missing arguments to max; returning -Inf
##
##
## There are 0 rows with errors; maximum for any given state is -Inf errors
##
##
## Data suppression checks passed
##
##
## *** File has been checked for uniqueness by: state year week age
##
## Rows: 108,099
## Columns: 12
## $ fullState <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama", "Ala~
## $ weekEnding <date> 2015-01-10, 2015-01-10, 2015-01-10, 2015-01-10, 2015-01-10~
## $ state <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL",~
## $ year <fct> 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015,~
## $ week <int> 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4,~
## $ age <fct> Under 25 years, 25-44 years, 45-64 years, 65-74 years, 75-8~
## $ period <fct> 2015-2019, 2015-2019, 2015-2019, 2015-2019, 2015-2019, 2015~
## $ Type <chr> "Predicted (weighted)", "Predicted (weighted)", "Predicted ~
## $ Suppress <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,~
## $ n <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,~
## $ deaths <dbl> 25, 67, 253, 202, 272, 320, 28, 49, 256, 222, 253, 332, 26,~
## $ Note <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,~
##
## Check Control Levels and Record Counts for Processed Data:
##
##
## Checking variable combination: age
## # A tibble: 6 x 4
## age n n_deaths_na deaths
## <fct> <dbl> <dbl> <dbl>
## 1 Under 25 years 12543 0 432096
## 2 25-44 years 16323 0 1118247
## 3 45-64 years 19812 0 4307809
## 4 65-74 years 19806 0 4368517
## 5 75-84 years 19813 0 5351113
## 6 85 years and older 19802 0 6752462
##
##
## Checking variable combination: period year Type
## # A tibble: 8 x 6
## period year Type n n_deaths_na deaths
## <fct> <fct> <chr> <dbl> <dbl> <dbl>
## 1 2015-2019 2015 Predicted (weighted) 14367 0 2698242
## 2 2015-2019 2016 Predicted (weighted) 14445 0 2725557
## 3 2015-2019 2017 Predicted (weighted) 14408 0 2802070
## 4 2015-2019 2018 Predicted (weighted) 14400 0 2830373
## 5 2015-2019 2019 Predicted (weighted) 14413 0 2843917
## 6 2020 2020 Predicted (weighted) 14834 0 3432816
## 7 2021 2021 Predicted (weighted) 14702 0 3450646
## 8 2022 2022 Predicted (weighted) 6530 0 1546623
##
##
## Checking variable combination: period Suppress
## # A tibble: 4 x 5
## period Suppress n n_deaths_na deaths
## <fct> <chr> <dbl> <dbl> <dbl>
## 1 2015-2019 <NA> 72033 0 13900159
## 2 2020 <NA> 14834 0 3432816
## 3 2021 <NA> 14702 0 3450646
## 4 2022 <NA> 6530 0 1546623
##
##
## Checking variable combination: period Note
## # A tibble: 9 x 5
## period Note n n_deaths_na deaths
## <fct> <chr> <dbl> <dbl> <dbl>
## 1 2015-20~ <NA> 72033 0 1.39e7
## 2 2020 Data in recent weeks are incomplete. Only ~ 279 0 8.69e4
## 3 2020 <NA> 14555 0 3.35e6
## 4 2021 Data in recent weeks are incomplete. Only ~ 13990 0 3.20e6
## 5 2021 Data in recent weeks are incomplete. Only ~ 15 0 4.01e2
## 6 2021 Data in recent weeks are incomplete. Only ~ 697 0 2.51e5
## 7 2022 Data in recent weeks are incomplete. Only ~ 1058 0 1.61e5
## 8 2022 Data in recent weeks are incomplete. Only ~ 86 0 7.94e3
## 9 2022 Data in recent weeks are incomplete. Only ~ 5386 0 1.38e6
##
## *** File has been checked for uniqueness by: cluster year week
##
## Plots will be run after excluding stateNoCheck states
##
## Detailed cluster summary PDF file is available at: ./RInputFiles/Coronavirus/Plots/CDC_cluster_2022w24.pdf
##
## Returning plot outputs to the main log file
## Joining, by = "state"
##
## Detailed age summary PDF file is available at: ./RInputFiles/Coronavirus/Plots/CDC_age_2022w24.pdf
##
## Returning plot outputs to the main log file
saveToRDS(cdcList_20220713, ovrWriteError=FALSE)
# STEP 2: Latest death bu location-cause data
allCause_220713 <- analyzeAllCause(loc="COvID_deaths_age_place_20220713.csv",
cdcDailyList=readFromRDS("cdc_daily_220704"),
compareThruDate="2022-06-30"
)
## `summarise()` has grouped output by 'State'. You can override using the `.groups` argument.
##
## States without abbreviations
## # A tibble: 2 x 10
## # Groups: State [2]
## State abb Year Month covidDeaths totalDeaths pneumoDeaths pneumoCovidDeat~
## <chr> <chr> <int> <int> <dbl> <dbl> <dbl> <dbl>
## 1 New Y~ <NA> 0 0 35270 174129 22877 13064
## 2 Puert~ <NA> 0 0 4459 80624 11310 3179
## # ... with 2 more variables: fluDeaths <dbl>, pnemoFluCovidDeaths <dbl>
##
## Sub-lists are identical by: asofDate, startDate, endDate, Group, State, deathPlace, Age
## # A tibble: 1,818 x 12
## asofDate startDate endDate Group State deathPlace Age name dfSub
## <date> <date> <date> <chr> <chr> <chr> <chr> <chr> <dbl>
## 1 2022-07-06 2020-10-01 2020-10-31 By Mo~ Unite~ Total - All~ 30-3~ pnem~ 205
## 2 2022-07-06 2021-10-01 2021-10-31 By Mo~ Unite~ Decedent's ~ 40-4~ pnem~ 150
## 3 2022-07-06 2020-02-01 2020-02-29 By Mo~ Unite~ Total - All~ 30-3~ pnem~ 71
## 4 2022-07-06 2021-11-01 2021-11-30 By Mo~ Unite~ Healthcare ~ 75-8~ pnem~ 139
## 5 2022-07-06 2022-04-01 2022-04-30 By Mo~ Unite~ Total - All~ All ~ fluD~ 184
## 6 2022-07-06 2020-11-01 2020-11-30 By Mo~ Unite~ Total - All~ 30-3~ pneu~ 227
## 7 2022-07-06 2021-08-01 2021-08-31 By Mo~ Unite~ Other All ~ pneu~ 627
## 8 2022-07-06 2022-06-01 2022-06-30 By Mo~ Unite~ Decedent's ~ 85 y~ pneu~ 183
## 9 2022-07-06 2020-01-01 2022-07-02 By To~ Unite~ Total - All~ 0-17~ fluD~ 50
## 10 2022-07-06 2020-01-01 2022-07-02 By To~ Unite~ Total - All~ 30-3~ fluD~ 200
## # ... with 1,808 more rows, and 3 more variables: dfTot <dbl>, delta <dbl>,
## # pct <dbl>
##
## Sub-lists are identical by: asofDate, startDate, endDate, Group, State, deathPlace, Age
## # A tibble: 0 x 12
## # ... with 12 variables: asofDate <date>, startDate <date>, endDate <date>,
## # Group <chr>, State <chr>, deathPlace <chr>, Age <chr>, name <chr>,
## # dfSub <dbl>, dfTot <dbl>, delta <dbl>, pct <dbl>
##
## Sub-lists are identical by: asofDate, startDate, endDate, Group, State, deathPlace, Age
## # A tibble: 0 x 12
## # ... with 12 variables: asofDate <date>, startDate <date>, endDate <date>,
## # Group <chr>, State <chr>, deathPlace <chr>, Age <chr>, name <chr>,
## # dfSub <dbl>, dfTot <dbl>, delta <dbl>, pct <dbl>
## # A tibble: 51 x 4
## abb cumValue tot_deaths pctdiff
## <chr> <dbl> <dbl> <dbl>
## 1 NY 36925 69007 0.303
## 2 DC 1994 1351 0.192
## 3 WY 1462 1834 0.113
## 4 ND 2802 2296 0.0993
## 5 GA 32661 38579 0.0831
## 6 NC 29438 25211 0.0773
## 7 MI 32104 36918 0.0697
## 8 NE 4986 4342 0.0690
## 9 AZ 26808 30515 0.0647
## 10 OH 44034 38852 0.0625
## # ... with 41 more rows
## # A tibble: 1 x 3
## cumValue tot_deaths pctdiff
## <dbl> <dbl> <dbl>
## 1 974598 1008140 1.91
## Warning: Removed 8 rows containing missing values (geom_col).
## Warning: Removed 8 rows containing missing values (geom_col).
saveToRDS(allCause_220713, ovrWriteError=FALSE)
# STEP 3: Facets for excess all-cause deaths
excessDeathFacets(lstCDC=cdcList_20220713, lstAll=allCause_220713, dateThru="2022-05-31", plotYLim=c(-200, 1200))
There have been issues with US all-cause deaths data since a “systems upgrade” in mid-June. How much restatement of data has occurred?
# Mapping file of epiweek and epiyear to date
mapEpi <- tibble::tibble(date=seq.Date(as.Date("2014-12-01"), as.Date("2031-01-31"), by=1)) %>%
mutate(epiYear=as.integer(lubridate::epiyear(date)), epiWeek=as.integer(lubridate::epiweek(date)))
nameFile <- "ageAgg"
dfCheck <- bind_rows(readFromRDS("cdcList_20220713")[[nameFile]],
readFromRDS("cdcList_20220623")[[nameFile]],
readFromRDS("cdcList_20220105")[[nameFile]],
.id="fileDate"
) %>%
mutate(fileDate=c("1"="2022-07-13", "2"="2022-06-23", "3"="2022-01-05")[fileDate])
mapEpi %>%
arrange(date) %>%
group_by(epiYear, epiWeek) %>%
filter(row_number()==1) %>%
ungroup() %>%
rename(yearint=epiYear, week=epiWeek) %>%
right_join(dfCheck, by=c("yearint", "week")) %>%
ggplot(aes(x=date, y=deaths)) +
geom_line(aes(color=fileDate, group=fileDate)) +
lims(y=c(0, NA)) +
labs(x=NULL, y="Reported all-cause US deaths", title="US all-cause deaths by report date") +
facet_wrap(~age, scales="free_y")
Data appear anomalous, particularly 2022 deaths in “Under 25 years” and “25-44 years”. Partly, this is incomplete reporting in the most recent weeks (normal), but partly this may be driven by data not yet re-entered after the upgrade. It is striking that there are fewer reported all-cause deaths in the 2022-07-13 data than in the 2022-06-23 data for any cohort, as all-cause data almost always increases as additional reports are received from vital statistics departments. Trends among “45-64 years” and senior citizens, at a glance, are the more commonly observed build over time
The process is converted to functional form:
makeRestatementData <- function(vecFiles, key, vecNames=NULL, epiRange=as.Date(c("2014-12-01", "2031-01-31"))) {
# FUNCTION ARGUMENTS:
# vecFiles: character vector of file names (will be extracted using readFromRDS)
# key: the extract element from each of the lists
# vecNames: names to be used in plot for each of the extracts (NULL means infer from ...)
# epiRange: range for converting epiweek and epiyear to date (should be a larger range than data)
# Create keyNames if not provided
if(is.null(vecNames)) {
vecNames <- as.character(lubridate::ymd(stringr::str_remove(vecFiles, ".*_"))) %>%
purrr::set_names(as.character(1:length(vecFiles)))
}
# Create epi mapping file
dfEpi <- tibble::tibble(date=seq.Date(epiRange[1], epiRange[2], by=1)) %>%
mutate(epiYear=as.integer(lubridate::epiyear(date)),
epiWeek=as.integer(lubridate::epiweek(date))
)
# Create single date for each epiWeek and epiYear
mapEpi <- dfEpi %>%
arrange(date) %>%
group_by(epiYear, epiWeek) %>%
filter(row_number()==1) %>%
ungroup() %>%
rename(yearint=epiYear, week=epiWeek)
# Read and integrate file, add epiDate
purrr::map_dfr(.x=vecFiles,
.f=function(x) readFromRDS(x)[[key]],
.id="fileDate"
) %>%
mutate(fileDate=vecNames[fileDate]) %>%
left_join(mapEpi, by=c("yearint", "week"))
}
makeRestatementData(c("cdcList_20220713", "cdcList_20220623", "cdcList_20220105"), key="ageAgg")
## # A tibble: 6,810 x 12
## fileDate age year week deaths weekfct yearint pred delta cumDelta
## <chr> <fct> <fct> <int> <dbl> <fct> <int> <dbl> <dbl> <dbl>
## 1 2022-07-13 Under 25 ~ 2015 1 1069 1 2015 1143. -74.4 -74.4
## 2 2022-07-13 Under 25 ~ 2016 1 1067 1 2016 1122. -55.0 -55.0
## 3 2022-07-13 Under 25 ~ 2017 1 1147 1 2017 1101. 46.4 46.4
## 4 2022-07-13 Under 25 ~ 2018 1 1185 1 2018 1079. 106. 106.
## 5 2022-07-13 Under 25 ~ 2019 1 1035 1 2019 1058. -22.8 -22.8
## 6 2022-07-13 Under 25 ~ 2020 1 1101 1 2020 1036. 64.6 64.6
## 7 2022-07-13 Under 25 ~ 2021 1 1072 1 2021 1015. 57.0 57.0
## 8 2022-07-13 Under 25 ~ 2022 1 931 1 2022 994. -62.6 -62.6
## 9 2022-07-13 Under 25 ~ 2015 2 1103 2 2015 1133. -30.0 -104.
## 10 2022-07-13 Under 25 ~ 2016 2 1068 2 2016 1112. -43.6 -98.6
## # ... with 6,800 more rows, and 2 more variables: cumPred <dbl>, date <date>
makeRestatementData(c("cdcList_20220713", "cdcList_20220623", "cdcList_20220105"), key="stateAgg")
## # A tibble: 57,156 x 12
## fileDate state year week deaths weekfct yearint pred delta cumDelta
## <chr> <fct> <fct> <int> <dbl> <fct> <int> <dbl> <dbl> <dbl>
## 1 2022-07-13 AK 2015 1 62 1 2015 69.9 -7.90 -7.90
## 2 2022-07-13 AK 2016 1 60 1 2016 71.4 -11.4 -11.4
## 3 2022-07-13 AK 2017 1 85 1 2017 73 12.0 12.0
## 4 2022-07-13 AK 2018 1 71 1 2018 74.6 -3.55 -3.55
## 5 2022-07-13 AK 2019 1 87 1 2019 76.1 10.9 10.9
## 6 2022-07-13 AK 2020 1 77 1 2020 77.7 -0.656 -0.656
## 7 2022-07-13 AK 2021 1 101 1 2021 79.2 21.8 21.8
## 8 2022-07-13 AK 2022 1 102 1 2022 80.8 21.2 21.2
## 9 2022-07-13 AK 2015 2 77 2 2015 73.3 3.70 -4.19
## 10 2022-07-13 AK 2016 2 65 2 2016 74.8 -9.85 -21.3
## # ... with 57,146 more rows, and 2 more variables: cumPred <dbl>, date <date>
makeRestatementData(c("cdcList_20220713", "cdcList_20220623", "cdcList_20220105"), key="clusterAgg")
## # A tibble: 7,945 x 14
## fileDate cluster year week deaths weekfct yearint pred delta cumDelta
## <chr> <chr> <fct> <dbl> <dbl> <fct> <dbl> <dbl> <dbl> <dbl>
## 1 2022-07-13 4 2015 1 5966 1 2015 5539. 427. 427.
## 2 2022-07-13 4 2016 1 5174 1 2016 5624. -450. -450.
## 3 2022-07-13 4 2017 1 5683 1 2017 5708. -25.4 -25.4
## 4 2022-07-13 4 2018 1 6163 1 2018 5793. 370. 370.
## 5 2022-07-13 4 2019 1 5556 1 2019 5878. -322. -322.
## 6 2022-07-13 4 2020 1 5729 1 2020 5962. -233. -233.
## 7 2022-07-13 4 2021 1 7176 1 2021 6047. 1129. 1129.
## 8 2022-07-13 4 2022 1 7651 1 2022 6132. 1519. 1519.
## 9 2022-07-13 4 2015 2 5650 2 2015 5478. 172. 599.
## 10 2022-07-13 4 2016 2 5195 2 2016 5563. -368. -818.
## # ... with 7,935 more rows, and 4 more variables: cumPred <dbl>, pop <dbl>,
## # cvDeaths <dbl>, date <date>
makeRestatementData(c("cdcList_20220713", "cdcList_20220623", "cdcList_20220105"), key="allUSAgg")
## # A tibble: 1,135 x 11
## fileDate year week deaths weekfct yearint pred delta cumDelta cumPred
## <chr> <fct> <int> <dbl> <fct> <int> <dbl> <dbl> <dbl> <dbl>
## 1 2022-07-13 2015 1 61700 1 2015 58816. 2884. 2884. 58816.
## 2 2022-07-13 2016 1 55844 1 2016 59578. -3734. -3734. 59578.
## 3 2022-07-13 2017 1 59714 1 2017 60340. -626. -626. 60340.
## 4 2022-07-13 2018 1 66153 1 2018 61101. 5052. 5052. 61101.
## 5 2022-07-13 2019 1 58287 1 2019 61863. -3576. -3576. 61863.
## 6 2022-07-13 2020 1 60034 1 2020 62625. -2591. -2591. 62625.
## 7 2022-07-13 2021 1 86923 1 2021 63387. 23536. 23536. 63387.
## 8 2022-07-13 2022 1 79507 1 2022 64149. 15358. 15358. 64149.
## 9 2022-07-13 2015 2 61072 2 2015 59164. 1908. 4792. 117980.
## 10 2022-07-13 2016 2 55525 2 2016 59926. -4401. -8135. 119504.
## # ... with 1,125 more rows, and 1 more variable: date <date>